#For Step 1
import folium # Import folium data
import geopandas as gpd # Import data visualisation functions
#For the rest of the phases
import pandas as pd # Import pandas package
import numpy as np # Import numpy package
import matplotlib.pyplot as plt # Import data visualation functions
import seaborn as sns # Import data visualation functions
import plotly.graph_objects as go # Import data visualation functions
import warnings as warnings # Import data visualation functions
import statsmodels.formula.api as smf # Import stats modelling package for OLS regression
from sklearn.model_selection import train_test_split # Import testing for model
from sklearn.preprocessing import StandardScaler # Import Standard Scaler to standardise data
Since Python Inc. is a luxury car dealer, we will identify its target group of high-income consumers who are more likely to purchase luxury cars. Hence, we select the US state with the highest median household income as it is a proxy for the average purchasing power of residents living in a particular area.
We utilised a combination of three datasets:
df = pd.read_csv('Income Dataset.csv')
stateslatlong = pd.read_csv('States Lat Long.csv')
stateslatlong.head()
df.head()
# Clean states lat long data to merge with household income
stateslatlong.columns = ['State', 'Latitude','Longitude']
namelist = stateslatlong['State'].tolist()
namelistnew = []
for i in namelist:
namelistnew.append(i.split(',')[0])
stateslatlong['State'] = namelistnew
#Rename Washington State to Washington for consistency across 2 datasets
stateslatlong.loc[(stateslatlong['State'] == 'Washington State'),'State'] = 'Washington'
#merge data sets
df_combined = pd.merge(left=df, right=stateslatlong, how = 'left' ,left_on='State', right_on='State')
# District of Columbia is missing but can remove because it is the old name of Washington
docindex = df_combined[(df_combined.State == 'District of Columbia')].index
docindex
df_combined = df_combined.drop(docindex).reset_index(drop = True)
df_combined.head()
# Change the values of Median Household Income from str to float
emplist = []
for x in df_combined["Median Household Income"]:
y = x.replace("$","")
z = y.replace(",","")
emplist.append(float(z))
df_combined["Median Household Income"] = pd.Series(emplist)
df_combined.head()
# Insert the state codes into the dataset
codes = pd.read_csv('states.csv')
codes.head()
#Remove District of Columbia from this code dataset
docindex = codes[(codes.State == 'District of Columbia')].index
docindex
codes = codes.drop(docindex).reset_index(drop = True)
#Merging 3 datasets
df_combined.insert(0, "Abbreviation", codes["Abbreviation"], allow_duplicates=False)
df_combined.head()
poly = gpd.read_file('us-states.json')
m = folium.Map(location=[44.788689, -95.501507], zoom_start=3.5)
for i in range(0, len(df_combined)):
folium.Marker(df_combined.iloc[i,3:5].tolist(), popup='Median Household Income in ' + '<b>' +\
df_combined.iloc[i,1] + '<b>'+ '<br>' + '<br>'+ '<b>' + str(df_combined.iloc[i,2]) +'<b>').add_to(m)
for i in range(0, len(poly)):
folium.GeoJson(poly['geometry'][i]).add_to(m)
m
#plotting chloropleth graph
x = df_combined
fig = go.Figure(data=go.Choropleth(
locations=x['Abbreviation'], # Spatial coordinates
z = x['Median Household Income'].astype(float), # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = 'Reds',
colorbar_title = "Millions USD",
))
fig.update_layout(
title_text = 'Median Household Income of USA States',
geo_scope='usa', # limite map scope to USA
)
fig.show()
# plotting median household income across US states
df = pd.DataFrame({'group':df_combined["State"], 'values':df_combined["Median Household Income"] })
# Reorder it following the values:
ordered_df = df.sort_values(by='values')
my_range=range(1,len(df.index)+1)
# The vertival plot is made using the hline function
sns.set(style='white', font='Arial',font_scale=1.3)
plt.figure(figsize=(18,20))
color = ['skyblue' if (x < max(ordered_df['values']))
else 'red' for x in ordered_df['values']]
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['values'], color=color)
plt.plot(ordered_df['values'], my_range, "o", color="darkblue")
# Add titles and axis names
plt.yticks(my_range, ordered_df['group'], size=12)
plt.title("Median Household Income of US States", loc='center',size=24, fontweight="bold")
plt.xlabel('Median Household Income',size=20)
plt.ylabel('US States',size=20)
plt.show()
In Maryland, 31.8 out of 100 residents own a registered car. Assuming median household income and vehicle ownership per capita does not fluctatuate much across various cities in Maryland, we select the city in Maryland with the highest population as it translates into the highest number of vehicle owners in the state. We also consider population density of each city in Maryland to determine the estimated number of vehicle owners per unit area of land.
We use a separate dataset indicating the population size and population density of each city in Maryland.
cities = pd.read_csv('Maryland Cities Population.csv')
cities.head()
#change the values of population from str to float
emplist1 = []
for x in cities["Population"]:
z = x.replace("$","")
zz = z.replace(",","")
emplist1.append(float(zz))
cities["Population"] = pd.Series(emplist1)
cities.head()
pop_d = pd.read_csv('pop_density.csv')
pop_d
#change the values of population from str to float
emplist2 = []
for x in pop_d["Land Area in Km^2"]:
zz = float(x)
emplist2.append(zz)
pop_d["Land Area in Km^2"] = pd.Series(emplist2)
pop_d.head()
top10bypopulation = cities.copy()
top10bypopulation = top10bypopulation.head(10)
top10bypopulation['Land Area in km^2'] = pop_d["Land Area in Km^2"]
top10bypopulation['Population Density (km^2)'] = top10bypopulation['Population']/top10bypopulation['Land Area in km^2']
top10bypopulation
#Plotting
sns.set(style='white', font='Arial',font_scale=1.3)
fig,ax = plt.subplots(figsize=(20,5))
d = top10bypopulation['Population']
color = ['grey' if (x < max(d)) else '#E83D3D' for x in d ]
ax.bar(top10bypopulation['City'], top10bypopulation['Population'], width = 0.8, color = color,alpha=0.8)
ax2 = ax.twinx()
sns.lineplot(x = top10bypopulation['City'] , y= top10bypopulation['Population Density (km^2)'], ax = ax2, color = 'blue')
#Labelling and aesthetics
ax.set_xlabel('Cities in Maryland',fontsize=17)
ax.set_ylabel('Population',fontsize=17)
ax2.set_ylabel('Population Density (km^2)',fontsize=15)
plt.title('Population of Cities in Maryland',fontsize=20,fontweight="bold")
plt.xticks(fontsize=13, rotation=90)
plt.show()
As luxury cars are in a niche market, consumers are more likely to travel a longer distance to purchase cars from a luxury car dealership. This means that our client will still be able to capture the consumer base farther away from its dealership location, which diminishes the impact of its slightly lower population density. Furthermore, given that Baltimore has the largest land area, its residents have a greater need for a car as a means of transport around the city. Thus, we overweight the larger population size compared to the slightly lower population density as it signifies a more substantial luxury car consumer base.
Since Maryland is the state with the highest median household income in the US, coupled with the fact that Baltimore has the highest vehicle ownership, we infer that there is a sizable market for luxury cars in Baltimore, Maryland. Hence, we would recommend for Python Inc. to consider setting up operations there.
For the rest of this report, we will be analysing US car sales and will be preparing our sample data for the relevant analysis
We utilised one key dataset:
data = pd.read_csv('Car_Sales (US).csv')
print('NaN values in each column:')
print(data.isna().sum()) # Data before cleaning
data = pd.read_csv('Car_Sales (US).csv')
def clean_dataset(df):
df.dropna(inplace=True) # Remove NaN values
df = df.loc[df['Transmission_Type']!='UNKNOWN'] # Remove "UNKNOWN" values for "Transmission_Types"
df = df.loc[df['Engine_Cylinders']!=0] # Remove "0" values for "Engine_Cylinders"
return df
data_cleaned = clean_dataset(data)
print('NaN values in each column:')
print(data_cleaned.isna().sum())
# To find out the total number of cars left in our sample
total_rows = len(data_cleaned.axes[0])
print('No of rows: '+ str(total_rows))
# Creating No. of transactons for each brand
car_sales = pd.DataFrame(data_cleaned['Make'].value_counts())
car_sales.columns =['Number of cars'] # Renaming the column index
car_sales.head()
car_brands = list(data_cleaned['Make'].unique()) # Creating a list of each of the car_brands
avg_brand_MSRP = [] # Empty list for appending later
sales_quantity = [] # Empty list for appending later
sales_quantity_perc = [] # Empty list for appending later
car_popularity = []
for item in car_brands:
"""This section is for finding the mean MSRP for each car brand"""
brand_selected = data_cleaned['Make'] == item # Isolate the rows to the specific car brand
brand_mean = data_cleaned.loc[brand_selected]['MSRP'].mean() # Finding the mean MSRP for the car brand
avg_brand_MSRP.append(brand_mean) # Appending mean MSRP for the Brand to the list
"""This section is for finding the number of cars per brand in this sample"""
sales_quantity.append(car_sales.loc[item]['Number of cars']) # The quantity of each type of car sold
sales_quantity_perc.append((car_sales.loc[item]['Number of cars'] / total_rows) * 100 ) # Quantity as a percentage
"""Car Popularity"""
car_popularity.append(data_cleaned.loc[brand_selected]['Popularity'].mean())
Brand_MSRP = pd.DataFrame() # Creation of empty data frame
Brand_MSRP['Brand'] = car_brands
Brand_MSRP['Average MSRP'] = avg_brand_MSRP
Brand_MSRP['Quantity Sold'] = sales_quantity
Brand_MSRP['Quantity Sold (% of total)'] = sales_quantity_perc
Brand_MSRP['Popularity'] = car_popularity
Brand_MSRP.head()
With the creation of data frames for average MSRP, popularity, and quantity sold, we now have the relevant metrics to determine which luxury brands to recommend for Python Inc.
For our data set, we are using price to segment all car brands int 3 types of cars that US consumers will buy, namely:
#With average MSRP of all brands, we sort into 3 bands
sorted_msrp = Brand_MSRP.sort_values(by= ['Average MSRP'],inplace=False)
#preparing data
xaxislabels = sorted_msrp['Brand']
#plotting
sns.set(style='white', font='Arial',font_scale=1)
plt.figure(figsize=(18,4))
plt.bar(xaxislabels,sorted_msrp['Average MSRP'], color= '#db991a', alpha = 0.6,edgecolor='black')
plt.scatter(xaxislabels,sorted_msrp['Average MSRP'], marker='x', c= '#633193', alpha = 1)
#Labelling and aesthetics
plt.xlabel('Car Brand', fontsize = 15)
plt.xticks(rotation = 90)
plt.title('Car Brands and their average MSRP',fontsize=18, fontweight="bold")
plt.ylabel('Average MSRP' , fontsize = 15)
plt.show()
We keep in mind that Bugatti's price is considered ultra-luxury as their average prices are extremely high
#Adding the cateogories into Sorted_MSRP dataframe
sorted_msrp["Car Category"] = "budget"
is_middle = (sorted_msrp['Average MSRP'] > 40000) & (sorted_msrp['Average MSRP'] <100000)
is_luxury = sorted_msrp['Average MSRP'] >= 100000
sorted_msrp.loc[is_middle,"Car Category"] = 'mid-tier'
sorted_msrp.loc[is_luxury,"Car Category"] = 'luxury'
sorted_msrp.head()
#preparing data
groups = sorted_msrp.groupby("Car Category",sort=False)
colors = ['#62411f','#f68e43','#221dbc'] #our desired colors
#plotting
sns.set(style='white', font='Arial',font_scale=1)
plt.figure(figsize=(18,4))
i = 0
for name, group in groups:
plt.bar(group["Brand"], group["Average MSRP"],color = colors[i],label=name, edgecolor='black')
i = i + 1
#Labelling and aesthetics
plt.xlabel('Car Brand', fontsize = 15)
plt.xticks(fontsize= 12, rotation = 90)
plt.title('Car Brands in each category',fontsize=18, fontweight='bold')
plt.ylabel('Average MSRP' , fontsize = 15)
plt.legend()
plt.show()
# We filter out the cars to only luxury brands, and then select top 3 brands to analyse
luxury_only = sorted_msrp.loc[sorted_msrp['Car Category'] == 'luxury']
#sorting brands by popularity
pop_sorted = luxury_only.sort_values(by= ['Popularity'],inplace=False)
pop_sorted.head()
#preparing data
top5luxpop = pop_sorted['Popularity'] > 500
xaxislabels = pop_sorted['Brand']
pop_adjusted = pop_sorted['Popularity'].loc[top5luxpop]
xaxislabels_adjusted = xaxislabels.loc[top5luxpop]
#plotting
sns.set(style='white', font='Arial',font_scale=1)
plt.figure(figsize=(15,4))
plt.plot(xaxislabels, pop_sorted['Popularity'] , c= 'b', alpha = 0.7,
label = 'Popularity', linewidth = 1.25, linestyle = '--')
plt.scatter(xaxislabels_adjusted, pop_adjusted, c= 'r', alpha = 1, marker = "*", label ="Top 5 Brands", s = 100)
plt.fill_between(xaxislabels_adjusted, pop_adjusted, alpha = 0.15 , color = 'g')
#Labelling and aesthetics
plt.xlabel('Brand', fontsize = 14)
plt.xticks(rotation = 0)
plt.ylabel('Popularity' , fontsize = 14)
plt.title('Luxury car brands sorted by popularity',fontsize=18, fontweight='bold')
plt.legend()
plt.show()
From the above, we narrow down our selection to the 5 most popular luxury brands: Bently, Bugatti, Lamborghini, Porsche and Ferrari.
#Sorting dataframe by quantity
qty_sorted = luxury_only.sort_values(by= ['Quantity Sold (% of total)'],inplace=False)
qty_sorted.head()
#preparing data
xaxislabels = qty_sorted['Brand']
#plotting
sns.set(style='white', font='Arial',font_scale=1)
plt.figure(figsize=(15,4))
plt.bar(xaxislabels, qty_sorted['Quantity Sold (% of total)'], color= 'r', alpha = 0.5, edgecolor='black')
#Labelling and aesthetics
plt.xlabel('Luxury Car Brand', fontsize = 12)
plt.ylabel('Quantity Sold (% of total)' , fontsize = 12)
plt.title('Luxury car brands sorted by quantity',fontsize=16, fontweight= 'bold')
plt.show()
#Preparing data
data_pie2 = qty_sorted['Quantity Sold']
brands_pie2 = qty_sorted['Brand']
#Plotting
fig = go.Figure(data=[go.Pie(labels=brands_pie2, values=data_pie2, textinfo='label+percent',
insidetextorientation='radial')])
#Labelling and aesthetics
fig.update_traces(hole=.3, hoverinfo="label+percent+name")
fig.update_layout(title_text="Proportion of Luxury cars Sold for each Brand")
fig.show()
Next, by comparing quantity sold across all brands, we determine the top 5 brands sold to be: Maserati, Ferrari, Bently, Aston Martin and Porsche
By corroborating our findings for the top 5 brands in both popularity and quantity sold, we can determine the top 3 car brands in the luxury car market.
#Top 5 Cars by Popularity
top_5_pop = pop_sorted[len(pop_sorted)-5:len(pop_sorted)]
top_5_pop = list(top_5_pop['Brand'])
print('The most popular luxury brands are: '+ str(top_5_pop))
#Top 5 Cars by Quantity
top_5_qty = qty_sorted[len(qty_sorted)-5:len(qty_sorted)]
top_5_qty = list(top_5_qty['Brand'])
print('The most sold luxury brands are: '+ str(top_5_qty))
#Finding intersection of the 2 data frames
top_3_brands = []
for i in luxury_only['Brand']:
if i in top_5_qty and i in top_5_pop:
top_3_brands.append(i)
else:
continue
top_3_brands
#Preparing data
brands = pop_sorted['Brand']
x = pop_sorted['Quantity Sold']
y = pop_sorted['Popularity']
z = pop_sorted['Average MSRP']
z= z/200
#Plotting
sns.set(style='white', font='Arial',font_scale=1.3)
plt.figure(figsize=(15,6))
# Change color with c and alpha. I map the color to the X axis value.
plt.scatter(x, y, s=z, c=y, cmap="summer_r", alpha=0.4, edgecolors="grey", linewidth=2)
# zip joins x,y coordinates in pairs with brand names
for i,j,k in zip(x,y,brands):
plt.annotate(k, # this is the text
(i,j), # this is the point to label
textcoords="offset points", # how to position the text
xytext=(0,0), # distance from text to points (x,y)
ha='center',fontsize= 9,fontweight= 'bold') # horizontal alignment can be left, right or center
plt.ylim(-600, 3050)
plt.xlim(-20,150)
plt.xlabel("Quantity Sold", fontsize= 12)
plt.ylabel("Popularity", fontsize= 12)
plt.title('Popularity of Brand v.s. Quantity Sold' , fontsize = 18, fontweight= 'bold')
plt.show()
To determine what are the attributes for luxury cars which correlates to higher MSRP so we can set higher prices hence more profits for the brands chosen
#Dataframe for luxury brands and their attributes
luxury_brands = ['Porsche', 'Maserati', 'Aston Martin', 'Spyker',
'Ferrari', 'McLaren', 'Bentley', 'Lamborghini', 'Rolls-Royce', 'Maybach', 'Bugatti']
is_luxury = data_cleaned['Make'].isin(luxury_brands)
is_luxury
luxury_data = data_cleaned.loc[is_luxury]
luxury_data.head()
#Preparing data
luxury_corr = luxury_data.corr()
#Plotting
sns.set(style='white', font='Arial',font_scale=1.3)
f,ax=plt.subplots(figsize=(11,6))
sns.heatmap(luxury_corr, cmap='rocket_r', annot=True, linewidths=.3)
#Labelling and aesthetics
plt.title("Correlation between features (Luxury Cars)",
weight='bold',
fontsize=18)
plt.show()
To find variables that affected price the most, we considered the variables with a Pearson's coefficient larger than 0.500 in absolute terms. As Seen above, 'Engine_HP' and 'Engine_Cylinders' correlates the most to MSRP.
However, looking at the correlation coefficient is insufficient to conclude whether a variable such as the engine horsepower can affect the MSRP for luxury cars. Hence, we will run a multiple regression analysis to confirm this relationship.
#OLS, Multiple Linear Regression
model = smf.ols('MSRP ~ Engine_HP + Year + C(Transmission_Type) + C(Driven_Wheels) + C(Vehicle_Size) + C(Vehicle_Style) \
+ C(Engine_Fuel_Type) + Popularity + City_MPG + Highway_MPG + \
Engine_Cylinders + Number_of_Doors', data=luxury_data)
result = model.fit()
print(result.summary())
pvalues = result.pvalues
print('\n'*2)
coeff_var = []
pvalues = result.pvalues
coeff = result.params
coeff.sort_values(inplace=True)
for i in coeff.index:
if pvalues[i] < 0.05:
coeff_var.append(i)
print('Top variables ranked by coefficients and Low P-Values:\n')
print(result.params[coeff_var])
From our original regression model, we remove Driven Wheels, Transmission_Type, City_MPG, and Highway_MPG as they fail our hypothesis test at 5% level of significance, hence meaning their beta-hat is likely to be zero
model = smf.ols('MSRP ~ Engine_HP + Year + C(Vehicle_Size) + C(Vehicle_Style) + C(Engine_Fuel_Type) \
+ Popularity + Engine_Cylinders + Number_of_Doors', data=luxury_data)
result_2 = model.fit()
print(result_2.summary())
Conclusion: Looking at p-values, for p-value > 0.05, we drop Vehicle size 'Midsize', Vehicle Style 'Couple' and 'Sedan', Engine Fuel Type (Regular Unleaded) and Popularity, as they fail our hypothesis test at 5% level of significance
df = luxury_data
# Modify data frame by adding dummy variables
df2 = pd.concat([df, pd.get_dummies(df['Vehicle_Size'].astype('category'), prefix = 'd')], axis = 1)
df2 = pd.concat([df2, pd.get_dummies(df2['Vehicle_Style'].str.replace(' ','_').astype('category'), prefix = 'd')], axis = 1)
df2 = pd.concat([df2, pd.get_dummies(df2['Engine_Fuel_Type'].str.replace(' ','_').
str.replace('-','_').str.replace('(', '').str.replace(')','').
str.replace('/', '_').astype('category'), prefix = 'd')], axis = 1)
# Regression model
model = smf.ols('MSRP ~ Engine_HP + Year + d_Large + d_4dr_SUV + d_Sedan + d_flex_fuel_premium_unleaded_required_E85 \
+ d_premium_unleaded_required + Engine_Cylinders + Number_of_Doors', data=df2)
result_3 = model.fit()
print(result_3.summary())
Conclusion: Looking at our third run, we can eliminate premium unleaded (required) as well.
As seen below, removing premium unleaded (required) does not vary R^2 but causes all p-values to be acceptable within 5% level of significance
# Regression model
model = smf.ols('MSRP ~ Engine_HP + Year + d_Large + d_4dr_SUV + d_Sedan \
+ d_flex_fuel_premium_unleaded_required_E85 + Engine_Cylinders + Number_of_Doors', data=df2)
result_4 = model.fit()
print(result_4.summary())
#Preparing data
coefficients = pd.DataFrame(result_4.params[1:]) # Estimates of model parameters, after removing 'intercept'
coefficients[0] = StandardScaler().fit_transform(coefficients)
coefficients['colors'] = ['red' if x < 0 else 'green' for x in coefficients[0]]
coefficients = coefficients.sort_values(by= [0],inplace=False)
#Plotting
sns.set(style='darkgrid', font_scale=1.3, font="Arial",
rc={'axes.facecolor':'#d8d8d8'})
plt.figure(figsize=(14,10), dpi= 80)
plt.hlines(y=coefficients.index, xmin=0, xmax=coefficients[0], color=coefficients.colors, alpha=0.7, linewidth=5)
#Labelling and aesthetics
plt.yticks(coefficients.index, fontsize=15)
plt.ylabel('Attributes', fontsize=16)
plt.xlabel('Regression Coefficient', fontsize=16)
plt.title('Standardized Coefficients of Regression Model Attributes', fontdict={'size':22}, fontweight = 'bold')
plt.grid(linestyle='--', alpha=0.7)
plt.show()
From this graph, we can see that out of our 8 variables, 4-door SUVs and Sedans actually negatively affect price while the rest are variables that affect price.
The train-test split procedure is used to estimate the performance of machine learning algorithms when they are used to make predictions on data not used to train the model.
train, test = train_test_split(df2,
train_size=0.75,
test_size=0.25) # Split the data set by a 75/25 ratio
# Specify the model based on the variable selection
model ='MSRP ~ Engine_HP + Year + d_Large + d_4dr_SUV + d_Sedan \
+ d_flex_fuel_premium_unleaded_required_E85 + Engine_Cylinders + Number_of_Doors'
# Fit the model to identify the coefficients
result = smf.ols(model,data=train).fit()
# Use the fitted model to make predictions
amt_hat = result.predict(test)
print('Summary of MSRP (Luxury Cars):')
print('Mean: ',df2['MSRP'].mean())
print('Max: ',df2['MSRP'].max())
print('Min: ',df2['MSRP'].min(),'\n')
rmse = ((test['MSRP'] - amt_hat)**2).mean() ** 0.5
print('Root Mean Square Error = {0:0.4f}'.format(rmse))
print('Normalised Root Mean Square Error = ', rmse/(df2['MSRP'].max()-df2['MSRP'].min()))
A low Normalised Root Mean Square Error of < 0.1 confirms that our model is quite accurate in predicting price.
Specifically, we have identified a total of 8 variables that are significant determinants of MSRP. 4 of them are continuous variables while 4 of the other are categorical variables.
The variables are: Engine Horsepower, Year, Vehicle Size(Large), Vehicle Style (4-drive SUV and Sedan), Engine Fuel Type (Flex-fuel (premium unleaded required/E85)), Engine Cylinders, and Number of Doors
For our target 3 brands, there will be some categorical variables, ie Vehicle Style, that may not be present in those brands. As such, we will try to draw links to the categorical variables and continuous variables that show trends in our target brands
We will now determine how the chosen attributes affect our chosen brands
#Data containing the 3 brands we have identified earlier
top = (data_cleaned['Make'] == 'Bentley') | (data_cleaned['Make'] == 'Porsche') | (data_cleaned['Make'] == 'Ferrari')
top_3 = data_cleaned.loc[top]
top_3.head()
#We will use the 3 variable names below to refer to the slice of dataset concerning cars of that brand
Bentley = top_3.loc[top_3['Make']=='Bentley',:]
Porsche = top_3.loc[top_3['Make']=='Porsche',:]
Ferrari = top_3.loc[top_3['Make']=='Ferrari',:]
print('Number of Cars for chosen Brands\n')
print('Bentley:',Bentley.shape[0])
print('Porsche:',Porsche.shape[0])
print('Ferrari:',Ferrari.shape[0])
#Preparing data
violin_plot = ['MSRP','Year','Make']
bentley = Bentley.loc[:,violin_plot]
porsche = Porsche.loc[:,violin_plot]
ferrari = Ferrari.loc[:,violin_plot]
#Plotting
MSRP_YEAR_MAKE = pd.concat([bentley,porsche,ferrari])
MSRP_YEAR_MAKE = MSRP_YEAR_MAKE[MSRP_YEAR_MAKE['Year'].isin([x for x in range(2012,2018)])] #MSRP vs Make (2012 to 2017)
print('Range of years: [{0:0.0f}, {1:0.0f}]'.format(MSRP_YEAR_MAKE['Year'].min(),MSRP_YEAR_MAKE['Year'].max()))
sns.set(style='darkgrid', font_scale=1.3, font="Arial",
rc={'axes.facecolor':'#d8d8d8'})
ax = sns.violinplot(x="Make", y="MSRP", data=MSRP_YEAR_MAKE, palette = 'pastel')
#Labelling and aesthetics
ax.set_xlabel('Brand',fontsize=14)
ax.set_ylabel('MSRP',fontsize=14)
plt.title('MSRP Distribution for chosen Brands', fontsize=16, fontweight= 'bold')
plt.show()
Boxplot showing Horsepower distribution for chosen brands
#Preparing data
df = top_3 #to simplify reference to luxury data
#Plotting
plt.figure(figsize = (10,4))
sns.set(style='darkgrid', font_scale=1.3, font="calibri",
rc={'axes.facecolor':'#d8d8d8'})
sns.boxplot(x="Engine_HP", y="Make", data=df)
#Labelling and aesthetics
plt.title("Distribution of Horsepower for each Make",fontsize=18, fontweight= 'bold')
plt.show()
Scatter plot + regression line showing MSRP against Engine_HP for chosen brands
#Preparing data
cn = ['MSRP','Engine_HP']
ferrari = Ferrari.loc[:,cn]
porsche = Porsche.loc[:,cn]
bentley = Bentley.loc[:,cn]
df2 = pd.concat([ferrari,porsche,bentley],axis = 0)
min_HP = df2.loc[:,'Engine_HP'].min()
max_HP = df2.loc[:,'Engine_HP'].max()
x = np.linspace(min_HP, max_HP, 100, 0)
#linear regression coefficients
Da,Db,Dc = np.polyfit(df2['Engine_HP'], df2['MSRP'], 2) #Overall
Ia,Ib,Ic = np.polyfit(ferrari['Engine_HP'], ferrari['MSRP'], 2) #Ferrari
Wa,Wb,Wc = np.polyfit(porsche['Engine_HP'], porsche['MSRP'], 2) #Porsche
Ga,Gb,Gc = np.polyfit(bentley['Engine_HP'], bentley['MSRP'], 2) #Bentley
#Plotting
sns.set(style='darkgrid', font_scale=1.3, font="calibri",
rc={'axes.facecolor':'#d8d8d8'})
plt.figure(figsize = (13,10))
plt.scatter(ferrari['Engine_HP'], ferrari['MSRP'],marker='.',alpha=0.5,label='Ferrari')
plt.plot(x,Ia*x**2+Ib*x+Ic,linewidth = 4,label='Ferrari')
plt.scatter(porsche['Engine_HP'], porsche['MSRP'],marker='.',alpha=0.5,label='Porsche')
plt.plot(x,Wa*x**2+Wb*x+Wc,linewidth = 4,label='Porsche')
plt.scatter(bentley['Engine_HP'], bentley['MSRP'],marker='.',alpha=0.5,label='Bentley')
plt.plot(x,Ga*x**2+Gb*x+Gc,linewidth = 4,label='Bentley')
plt.plot(x,Da*x**2+Db*x+Dc,linewidth = 4,label='Overall')
#Labelling and aesthetics
plt.title('MSRP vs Engine Horsepower', fontsize=20, fontweight= 'bold')
plt.legend(fontsize=12)
plt.xlabel('Engine Horsepower', fontsize=15)
plt.ylabel('MSRP', fontsize=15)
plt.show()
#Statistics
print('Dataset contains target brands only\n')
lin_model = smf.ols('MSRP ~ Engine_HP',df2)
lin_result = lin_model.fit()
print('Linear Regression:')
print('Adj. R-squared: {0:0.10f}\n'.format(lin_result.rsquared_adj))
quad_model = smf.ols('MSRP ~ Engine_HP + np.square(Engine_HP)',df2)
quad_result = quad_model.fit()
print('Quadratic Regression:')
print('Adj. R-squared: {0:0.10f}\n'.format(quad_result.rsquared_adj))
if quad_result.rsquared_adj>lin_result.rsquared_adj:
print('Quadratic Regression is more suitable.')
#Preparing data
cn = ['MSRP','Engine_Cylinders']
ferrari = Ferrari.loc[:,cn]
porsche = Porsche.loc[:,cn]
bentley = Bentley.loc[:,cn]
df2 = pd.concat([ferrari,porsche,bentley],axis = 0)
min_Engine_Cylinders = df2.loc[:,'Engine_Cylinders'].min()
max_Engine_Cylinders = df2.loc[:,'Engine_Cylinders'].max()
x = np.linspace(min_Engine_Cylinders, max_Engine_Cylinders, 100, 0)
#linear regression coefficients
Da,Db,Dc = np.polyfit(df2['Engine_Cylinders'], df2['MSRP'], 2) #Overall
Ia,Ib,Ic = np.polyfit(ferrari['Engine_Cylinders'], ferrari['MSRP'], 2) #Ferrari
Wa,Wb,Wc = np.polyfit(porsche['Engine_Cylinders'], porsche['MSRP'], 2) #Porsche
Ga,Gb,Gc = np.polyfit(bentley['Engine_Cylinders'], bentley['MSRP'], 2) #Bentley
#Plotting
sns.set(style='darkgrid', font_scale=1.3, font="calibri",
rc={'axes.facecolor':'#d8d8d8'})
plt.figure(figsize = (13,10))
plt.scatter(ferrari['Engine_Cylinders'], ferrari['MSRP'],marker='.',alpha=0.5,label='Ferrari')
plt.plot(x,Ia*x**2+Ib*x+Ic,linewidth = 4,label='Ferrari')
plt.scatter(porsche['Engine_Cylinders'], porsche['MSRP'],marker='.',alpha=0.5,label='Porsche')
plt.plot(x,Wa*x**2+Wb*x+Wc,linewidth = 4,label='Porsche')
plt.scatter(bentley['Engine_Cylinders'], bentley['MSRP'],marker='.',alpha=0.5,label='Bentley')
plt.plot(x,Ga*x**2+Gb*x+Gc,linewidth = 4,label='Bentley')
plt.plot(x,Da*x**2+Db*x+Dc,linewidth = 4,label='Overall')
#Labelling and aesthetics
plt.title('MSRP vs Engine Cylinders', fontsize=20, fontweight= 'bold')
plt.legend(fontsize=12)
plt.xlabel('Engine_Cylinders', fontsize=15)
plt.ylabel('MSRP', fontsize=15)
warnings.filterwarnings("ignore")
plt.show()
#Statistics
print('Dataset contains target brands only\n')
lin_model = smf.ols('MSRP ~ Engine_Cylinders', df2)
lin_result = lin_model.fit()
print('Linear Regression:')
print('Adj. R-squared: {0:0.10f}\n'.format(lin_result.rsquared_adj))
quad_model = smf.ols('MSRP ~ Engine_Cylinders + np.square(Engine_Cylinders)',df2)
quad_result = quad_model.fit()
print('Quadratic Regression:')
print('Adj. R-squared: {0:0.10f}\n'.format(quad_result.rsquared_adj))
if quad_result.rsquared_adj>lin_result.rsquared_adj:
print('Quadratic Regression is more suitable.')
The categorical variables are:
#Preparing data
Bentley = top_3.loc[top_3['Make']=='Bentley',:]
Porsche = top_3.loc[top_3['Make']=='Porsche',:]
Ferrari = top_3.loc[top_3['Make']=='Ferrari',:]
cat1 = ('Vehicle_Size', 'Large')
cat2 = ('Engine_Fuel_Type','flex-fuel (premium unleaded required/E85)')
cat3 = ('Vehicle_Style', 'Sedan')
cat4 = ('Vehicle_Style', '4dr SUV')
categories = [cat1,cat2,cat3,cat4]
brands = [Bentley, Porsche, Ferrari]
brand_name = ['Bentley', 'Porsche', 'Ferrari']
ans = [[],[],[],[]]
r = 0
for i in categories:
res = ans[r]
for name in brands:
n = name[i[0]].value_counts().get(i[1]) #n gives the count according to cat for each brand
if n is None:
n = 0
res.append(n)
r = r+1
#Plotting
sns.set(style='darkgrid', font_scale=1.3, font="calibri",
rc={'axes.facecolor':'#d8d8d8'})
fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(15, 10))
ax = ax.reshape(-1)
titles = ['Qty of Cars that are Large per brand',
'Qty of Cars with flex-fuel (premium unleaded required/E85) per brand',
'Qty of Sedans per brand','Qty of 4 door SUVs per brand']
colors = ['#6497b1', '#35b899', '#b3c0e2','#ca5e27']
for i,j in enumerate(ax[:4]):
j.bar(brand_name,ans[i],color = colors[i], edgecolor='black')
j.title.set_text(titles[i])
j.set_ylabel('Number of Cars')
#Labelling and aesthetics
plt.suptitle("Barplots showing Distribution of Significant Categorical Variables for each Brand",
fontsize=20, fontweight= 'bold')
plt.tight_layout()
plt.subplots_adjust(top = 0.9)
plt.show()
plt.show()
print('Range of years: [{0:0.0f}, {1:0.0f}]'.format(MSRP_YEAR_MAKE['Year'].min(),MSRP_YEAR_MAKE['Year'].max()))
#Plotting
plt.figure(figsize = (10,4))
ax = sns.barplot(x="Make", y="MSRP", hue="Year", data=MSRP_YEAR_MAKE)
#Labelling and aesthetics
ax.set_xlabel('Brand',fontsize=14)
ax.set_ylabel('MSRP',fontsize=14)
plt.title('Prices of cars for each brand by year',fontsize=18, fontweight= 'bold')
ax.legend(loc='upper center', frameon=False)
plt.show()
As seen in the figure, from 2012 to 2017, expect for Porsche, the latest models indicate a higher MSRP. Hence, Python Inc. should sell newer car models for Bentley and Ferarri at a higher price.
Additional Graphs that may supplement our project
#Preparing data
xaxislabels = pop_sorted['Brand']
#Plotting
plt.figure(figsize=(15,4))
plt.plot(xaxislabels, pop_sorted['Popularity'] / 1000, c= 'b', alpha = 0.5, marker ='x', label = 'Popularity')
plt.plot(xaxislabels, pop_sorted['Quantity Sold (% of total)'], c= 'r', alpha = 0.5, marker = 'o', label = 'Quantity Sold (% of total)')
#Labelling and aesthetics
plt.xlabel('Car Brand', fontsize = 12)
plt.xticks(rotation = 90)
plt.ylabel('Popularity & Quantity Sold (% of total)' , fontsize = 12)
plt.title('Popularity and Quantity for luxury brands', fontweight= 'bold')
plt.legend()
plt.show()
#### We filter out the cars to only luxury brands, and then select top 3 brands to analysexaxislabels = Brand_MSRP['Brand']
#Plotting
width = 0.3
xa = np.arange(len(xaxislabels))
plt.figure(figsize=(15,4))
plt.bar( xa -0.5 * width, pop_sorted['Popularity'] / 1000,
width = width, color= 'b', alpha = 0.7, label = 'Popularity')
plt.bar( xa+0.5 * width, pop_sorted['Quantity Sold (% of total)'],
width = width, color= 'r', alpha = 0.7, label = 'Quantity Sold (% of total)')
#Labelling and aesthetics
plt.xlabel('Car Brand', fontsize = 12)
plt.xticks(xa , xaxislabels ,rotation = 90)
plt.ylabel('Popularity & Quantity Sold (% of total)' , fontsize = 12)
plt.title('Popularity of Brand v.s. Quantity Sold (% of total)' , fontsize = 15, fontweight= 'bold')
plt.legend()
plt.show()
#Preparing data
bar_plot = ['MSRP','Number_of_Doors','Make']
bentley = Bentley.loc[:,bar_plot]
porsche = Porsche.loc[:,bar_plot]
ferrari = Ferrari.loc[:,bar_plot]
#Plotting
plt.figure(figsize = (15,5))
door_data = pd.concat([bentley,porsche,ferrari])
#Labelling and aesthetics
ax = sns.barplot(x="Number_of_Doors", y="MSRP", hue="Make", data=door_data)
ax.set_xlabel('Number of Doors',fontsize=14)
ax.set_ylabel('MSRP',fontsize=14)
plt.title('Group bar plot of no. of doors vs MSRP', fontweight= 'bold')
plt.show()
The Group bar plot shows that PYTHON INC should sell Bently cars with 2 doors since the MSRP would be higher. For Porsche, there is no significant difference. Lasly, for Ferrari, it only has cars which have 2 doors.
Bentley = top_3.loc[top_3['Make']=='Bentley',:]
Porsche = top_3.loc[top_3['Make']=='Porsche',:]
Ferrari = top_3.loc[top_3['Make']=='Ferrari',:]
Bentley_VSL = Bentley.loc[(Bentley['Vehicle_Size']=='Large') & (Bentley['Engine_Cylinders']==8.0)]
Porsche_VSL = Porsche.loc[(Porsche['Vehicle_Size']=='Large') & (Porsche['Engine_Cylinders']==8.0)]
Ferrari_VSL = Ferrari.loc[(Ferrari['Vehicle_Size']=='Large') & (Ferrari['Engine_Cylinders']==8.0)]
print('Vehicle Size(Large) per brand:')
print('Bently: ',Bentley_VSL['Vehicle_Size'].count())
print('Porsche: ', Porsche_VSL['Vehicle_Size'].count())
print('Ferrari: ', Ferrari_VSL['Vehicle_Size'].count())
#Preparing data
bar_plot_VS = ['MSRP','Vehicle_Size','Make']
bentley = Bentley.loc[:,bar_plot_VS]
porsche = Porsche.loc[:,bar_plot_VS]
ferrari = Ferrari.loc[:,bar_plot_VS]
#Plotting
plt.figure(figsize = (15,5))
VS_data = pd.concat([bentley,porsche,ferrari])
#Labelling and aesthetics
ax = sns.barplot(x="Vehicle_Size", y="MSRP", hue="Make", data=VS_data)
ax.set_xlabel('Vehicle Size',fontsize=14)
ax.set_ylabel('MSRP',fontsize=14)
plt.title('Group bar plot of vehicle sizes vs MSRP', fontweight= 'bold')
plt.show()
To test: Vehicle Style (4-drive Sedan and SUV)
#Preparing data
bar_plot_VS2 = ['MSRP','Vehicle_Style','Make']
bentley = Bentley.loc[:,bar_plot_VS2]
porsche = Porsche.loc[:,bar_plot_VS2]
ferrari = Ferrari.loc[:,bar_plot_VS2]
VS2_data = pd.concat([bentley,porsche,ferrari])
#Plotting
plt.figure(figsize = (15,5))
ax = sns.barplot(x="Vehicle_Style", y="MSRP", hue="Make", data=VS2_data)
#Labelling and aesthetics
ax.set_xlabel('Vehicle Style',fontsize=14)
ax.set_ylabel('MSRP',fontsize=14)
plt.title('Group bar plot of vehicle type vs MSRP', fontweight= 'bold')
plt.show()
To test: Engine Fuel Type (Flex-fuel and flex-fuel (premium unleaded required/E85)),
#Preparing data
bar_plot_FT = ['MSRP','Engine_Fuel_Type','Make']
bentley = Bentley.loc[:,bar_plot_FT]
porsche = Porsche.loc[:,bar_plot_FT]
ferrari = Ferrari.loc[:,bar_plot_FT]
FT_data = pd.concat([bentley,porsche,ferrari])
#Plotting
plt.figure(figsize = (15,5))
ax = sns.barplot(x="Engine_Fuel_Type", y="MSRP", hue="Make", data=FT_data)
#Labelling and aesthetics
plt.title('Group bar plot of Fuel Type vs MSRP', fontsize=18, fontweight= 'bold')
ax.set_xlabel('Engine Fuel Type',fontsize=14)
ax.set_ylabel('MSRP',fontsize=14)
plt.xticks(fontsize=10)
plt.show()
#Preparing data
sorted_pop = Brand_MSRP.sort_values(by= ['Popularity'],inplace=False)
brand = sorted_pop['Brand']
popularity = sorted_pop['Popularity']
#Plotting of graph
plt.figure(figsize=(16, 4))
plt.bar(brand, popularity, width=0.7, color='#db7530', alpha=0.9)
#Labelling and aesthetics
plt.title('Brand Vs Popularity', fontsize=18, fontweight= 'bold')
plt.xticks(rotation = 90, fontsize=8)
plt.xlabel('Brand')
plt.ylabel('Popularity')
plt.show()
xaxislabels = sorted_msrp['Brand']
#Plotting
fig, ax1 = plt.subplots(figsize=(20,10)) # Using subplot function
ax2 = ax1.twinx() # Mirror the y-axis, making a secondary axis
ax1.bar(xaxislabels, sorted_msrp['Quantity Sold'], alpha = 0.8, color = 'b', label ='Quantity of Cars Sold')
ax2.plot(xaxislabels, sorted_msrp['Average MSRP'], marker = 'x', c = 'r', label ='Average MSRP of Brand' )
#Labelling and aesthetics
ax1.set_xlabel('Car Brand', fontsize = 12)
ax1.set_ylabel('Number of cars sold in sample' , fontsize = 12)
ax2.set_ylabel('Average MSRP of Car Brand', fontsize = 12, rotation = 270)
plt.title('Average MSRP and quantity sold for each brand', fontsize=18, fontweight= 'bold')
plt.setp(ax1.xaxis.get_majorticklabels(), rotation = 90) # Setting the xtick labels to rotate 90 degrees
fig.legend(loc='upper center')
plt.show()